{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import RegexpTokenizer\n",
    "from stop_words import get_stop_words\n",
    "from nltk.stem.porter import PorterStemmer\n",
    "from gensim import corpora, models\n",
    "import gensim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tokenizer = RegexpTokenizer(r'\\w+')\n",
    "\n",
    "# create English stop words list\n",
    "en_stop = get_stop_words('en')\n",
    "\n",
    "# Create p_stemmer of class PorterStemmer\n",
    "p_stemmer = PorterStemmer()\n",
    "    \n",
    "# create sample documents\n",
    "doc_a = \"Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.\"\n",
    "doc_b = \"My mother spends a lot of time driving my brother around to baseball practice.\"\n",
    "doc_c = \"Some health experts suggest that driving may cause increased tension and blood pressure.\"\n",
    "doc_d = \"I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.\"\n",
    "doc_e = \"Health professionals say that brocolli is good for your health.\" \n",
    "\n",
    "# compile sample documents into a list\n",
    "doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]\n",
    "\n",
    "# list for tokenized documents in loop\n",
    "texts = []\n",
    "\n",
    "# loop through document list\n",
    "for i in doc_set:\n",
    "    \n",
    "    # clean and tokenize document string\n",
    "    raw = i.lower()\n",
    "    tokens = tokenizer.tokenize(raw)\n",
    "\n",
    "    # remove stop words from tokens\n",
    "    stopped_tokens = [i for i in tokens if not i in en_stop]\n",
    "    \n",
    "    # stem tokens\n",
    "    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]\n",
    "    \n",
    "    # add tokens to list\n",
    "    texts.append(stemmed_tokens)\n",
    "\n",
    "# turn our tokenized documents into a id <-> term dictionary\n",
    "dictionary = corpora.Dictionary(texts)\n",
    "    \n",
    "# convert tokenized documents into a document-term matrix\n",
    "corpus = [dictionary.doc2bow(text) for text in texts]\n",
    "\n",
    "# generate LDA model\n",
    "ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, '0.072*\"drive\" + 0.043*\"health\" + 0.043*\"pressur\" + 0.043*\"caus\"'), (1, '0.081*\"brocolli\" + 0.081*\"good\" + 0.059*\"brother\" + 0.059*\"mother\"')]\n"
     ]
    }
   ],
   "source": [
    "print(ldamodel.print_topics(num_topics=2, num_words=4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, '0.072*\"drive\" + 0.043*\"health\" + 0.043*\"pressur\"'), (1, '0.081*\"brocolli\" + 0.081*\"good\" + 0.059*\"brother\"')]\n"
     ]
    }
   ],
   "source": [
    "print(ldamodel.print_topics(num_topics=3, num_words=3))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
